{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(14999, 10)\n",
      "['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'sales', 'salary']\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import category_encoders as ce\n",
    "hr_data = pd.read_csv('data/hr.csv', header=0)\n",
    "hr_data.head()\n",
    "hr_data = hr_data.dropna()\n",
    "print(hr_data.shape)\n",
    "print(list(hr_data.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>salary_0</th>\n",
       "      <th>salary_1</th>\n",
       "      <th>salary_2</th>\n",
       "      <th>salary_-1</th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_montly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>left</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   salary_0  salary_1  salary_2  salary_-1  satisfaction_level  \\\n",
       "0         1         0         0          0                0.38   \n",
       "1         0         1         0          0                0.80   \n",
       "2         0         1         0          0                0.11   \n",
       "3         1         0         0          0                0.72   \n",
       "4         1         0         0          0                0.37   \n",
       "\n",
       "   last_evaluation  number_project  average_montly_hours  time_spend_company  \\\n",
       "0             0.53               2                   157                   3   \n",
       "1             0.86               5                   262                   6   \n",
       "2             0.88               7                   272                   4   \n",
       "3             0.87               5                   223                   5   \n",
       "4             0.52               2                   159                   3   \n",
       "\n",
       "   Work_accident  left  promotion_last_5years  sales  \n",
       "0              0     1                      0  sales  \n",
       "1              0     1                      0  sales  \n",
       "2              0     1                      0  sales  \n",
       "3              0     1                      0  sales  \n",
       "4              0     1                      0  sales  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "onehot_encoder = ce.OneHotEncoder(cols=['salary'])\n",
    "onehot_df = onehot_encoder.fit_transform(hr_data)\n",
    "onehot_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    7316\n",
       "1    6446\n",
       "2    1237\n",
       "Name: salary, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ordinal_encoder = ce.OrdinalEncoder(cols=['salary'])\n",
    "ordinal_df = ordinal_encoder.fit_transform(hr_data)\n",
    "ordinal_df.head(10)\n",
    "ordinal_df['salary'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>salary_0</th>\n",
       "      <th>salary_1</th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_montly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>left</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   salary_0  salary_1  satisfaction_level  last_evaluation  number_project  \\\n",
       "0         0         0                0.38             0.53               2   \n",
       "1         0         1                0.80             0.86               5   \n",
       "2         0         1                0.11             0.88               7   \n",
       "3         0         0                0.72             0.87               5   \n",
       "4         0         0                0.37             0.52               2   \n",
       "\n",
       "   average_montly_hours  time_spend_company  Work_accident  left  \\\n",
       "0                   157                   3              0     1   \n",
       "1                   262                   6              0     1   \n",
       "2                   272                   4              0     1   \n",
       "3                   223                   5              0     1   \n",
       "4                   159                   3              0     1   \n",
       "\n",
       "   promotion_last_5years  sales  \n",
       "0                      0  sales  \n",
       "1                      0  sales  \n",
       "2                      0  sales  \n",
       "3                      0  sales  \n",
       "4                      0  sales  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binary_encoder = ce.BinaryEncoder(cols=['salary'])\n",
    "df_binary = binary_encoder.fit_transform(hr_data)\n",
    "df_binary.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col_salary_0</th>\n",
       "      <th>col_salary_1</th>\n",
       "      <th>col_salary_2</th>\n",
       "      <th>col_satisfaction_level</th>\n",
       "      <th>col_last_evaluation</th>\n",
       "      <th>col_number_project</th>\n",
       "      <th>col_average_montly_hours</th>\n",
       "      <th>col_time_spend_company</th>\n",
       "      <th>col_Work_accident</th>\n",
       "      <th>col_left</th>\n",
       "      <th>col_promotion_last_5years</th>\n",
       "      <th>col_sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-7.071068e-01</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-4.433780e-17</td>\n",
       "      <td>-0.816497</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-4.433780e-17</td>\n",
       "      <td>-0.816497</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-7.071068e-01</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-7.071068e-01</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col_salary_0  col_salary_1  col_salary_2  col_satisfaction_level  \\\n",
       "0           1.0 -7.071068e-01      0.408248                    0.38   \n",
       "1           1.0 -4.433780e-17     -0.816497                    0.80   \n",
       "2           1.0 -4.433780e-17     -0.816497                    0.11   \n",
       "3           1.0 -7.071068e-01      0.408248                    0.72   \n",
       "4           1.0 -7.071068e-01      0.408248                    0.37   \n",
       "\n",
       "   col_last_evaluation  col_number_project  col_average_montly_hours  \\\n",
       "0                 0.53                   2                       157   \n",
       "1                 0.86                   5                       262   \n",
       "2                 0.88                   7                       272   \n",
       "3                 0.87                   5                       223   \n",
       "4                 0.52                   2                       159   \n",
       "\n",
       "   col_time_spend_company  col_Work_accident  col_left  \\\n",
       "0                       3                  0         1   \n",
       "1                       6                  0         1   \n",
       "2                       4                  0         1   \n",
       "3                       5                  0         1   \n",
       "4                       3                  0         1   \n",
       "\n",
       "   col_promotion_last_5years col_sales  \n",
       "0                          0     sales  \n",
       "1                          0     sales  \n",
       "2                          0     sales  \n",
       "3                          0     sales  \n",
       "4                          0     sales  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "poly_encoder = ce.PolynomialEncoder(cols=['salary'])\n",
    "df_poly = poly_encoder.fit_transform(hr_data)\n",
    "df_poly.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col_salary_0</th>\n",
       "      <th>col_salary_1</th>\n",
       "      <th>col_salary_2</th>\n",
       "      <th>col_satisfaction_level</th>\n",
       "      <th>col_last_evaluation</th>\n",
       "      <th>col_number_project</th>\n",
       "      <th>col_average_montly_hours</th>\n",
       "      <th>col_time_spend_company</th>\n",
       "      <th>col_Work_accident</th>\n",
       "      <th>col_left</th>\n",
       "      <th>col_promotion_last_5years</th>\n",
       "      <th>col_sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col_salary_0  col_salary_1  col_salary_2  col_satisfaction_level  \\\n",
       "0           1.0          -1.0          -1.0                    0.38   \n",
       "1           1.0           1.0          -1.0                    0.80   \n",
       "2           1.0           1.0          -1.0                    0.11   \n",
       "3           1.0          -1.0          -1.0                    0.72   \n",
       "4           1.0          -1.0          -1.0                    0.37   \n",
       "\n",
       "   col_last_evaluation  col_number_project  col_average_montly_hours  \\\n",
       "0                 0.53                   2                       157   \n",
       "1                 0.86                   5                       262   \n",
       "2                 0.88                   7                       272   \n",
       "3                 0.87                   5                       223   \n",
       "4                 0.52                   2                       159   \n",
       "\n",
       "   col_time_spend_company  col_Work_accident  col_left  \\\n",
       "0                       3                  0         1   \n",
       "1                       6                  0         1   \n",
       "2                       4                  0         1   \n",
       "3                       5                  0         1   \n",
       "4                       3                  0         1   \n",
       "\n",
       "   col_promotion_last_5years col_sales  \n",
       "0                          0     sales  \n",
       "1                          0     sales  \n",
       "2                          0     sales  \n",
       "3                          0     sales  \n",
       "4                          0     sales  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "helmert_encoder = ce.HelmertEncoder(cols=['salary'])\n",
    "helmert_df = helmert_encoder.fit_transform(hr_data)\n",
    "helmert_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
